Here we show how embeddr (= spectral embedding + principal curves) can be used for pseudotemporal ordering of single-cell gene expression data using the monocle dataset. This uses the HSMMSingleCell dataset that is bundled with monocle.

library(monocle) ## for monocle data
library(devtools) ## for package development
library(reshape2) ## to melt data frames
library(plyr) 
library(dplyr) 
library(ggplot2)
library(ggthemes)
library(caret) ## for cross validation steps
library(scater) ## to hold single-cell data
library(knitr) ## for kable function
library(goseq) ## for GO enrichment
library(org.Hs.eg.db) ## for HG19 GO annotations
library(embeddr)
library(cowplot)

First we create the SCESet using the data from the HSMMSingleCell package:

## This is a bit fiddly since HSMMSingleCell changed format recently
sce <- NULL
hsmm_data_available <- data(package='HSMMSingleCell')$results[,3]
if("HSMM" %in% hsmm_data_available) {
  data(HSMM)
  sce <- fromCellDataSet(HSMM, use_exprs_as = 'fpkm')
} else if("HSMM_expr_matrix" %in% hsmm_data_available) {
  data(HSMM_expr_matrix)
  data(HSMM_gene_annotation)
  data(HSMM_sample_sheet)

  pd <- new('AnnotatedDataFrame', data = HSMM_sample_sheet)
  fd <- new('AnnotatedDataFrame', data = HSMM_gene_annotation)
  sce <- newSCESet(fpkmData = HSMM_expr_matrix, phenoData = pd, featureData = fd)
} else {
  stop('No recognised data types in HSMMSingleCell')
}

## add cell_id to HSMM to play nicely with dplyr
phenoData(sce)$cell_id <- rownames(pData(sce))

First we go through cleaning the monocle dataset and selecting for marker genes only:

## convert back to monocle
HSMM <- toCellDataSet(sce, use_as_exprs = "fpkm")
marker_genes <- row.names(subset(fData(HSMM),
                                 gene_short_name %in% c("MEF2C", "MEF2D", "MYF5", "ANPEP", "PDGFRA",
                  "MYOG", "TPM1", "TPM2", "MYH2", "MYH3", "NCAM1",
                  "CDK1", "CDK2", "CCNB1", "CCNB2", "CCND1", "CCNA")))
x <- log(exprs(HSMM[marker_genes,]) + 1)
x <- t(scale(t(x)))
# sce <- sce[marker_genes,]
# exprs(sce) <- x

Laplacian eigenmaps

The embeddr workflow creates the nearest neighbour graph then returns the laplacian eigenmap. We can do this using embeddr::embeddr with all options available, though embeddr::weighted_graph and embeddr::laplacian_eigenmap are available to perform each step by hand or with custom distance metrics. The default options specify a nearest neighbour graph with \(k = round(log(n))\) neighbours for \(n\) cells. Other options for creating the graph (such as distance measures and heat kernels) are also available.

sce <- embeddr(sce, genes_for_embedding = marker_genes)
plot_embedding(sce)
## Warning in plot_embedding(sce): color_by string cluster not found in
## pData(SCESet)
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:ggplot2':
## 
##     ggsave

The function embeddr::plot_embedding can be used at any time on the appropriate data.frame objects and will display all relevant information. We can start by seeing where the different inferred states from the monocle dataset fall on our embedding:

HSMM <- setOrderingFilter(HSMM, marker_genes)
HSMM <- reduceDimension(HSMM, use_irlba = F)
## Reducing to independent components
HSMM <- orderCells(HSMM, num_paths=2, reverse=F)
plot_spanning_tree(HSMM)

phenoData(sce)$monocle_state <- pData(HSMM)$State
plot_embedding(sce, color_by = 'monocle_state')

So there appears to be reasonable correspondance between the monocle clusters and natural clusters in our data. The embeddr::cluster_embedding function applies k-means clustering to the dataset:

set.seed(123)
sce <- cluster_embedding(sce, k=3, method='mm')
plot_embedding(sce)

Selecting genes for the embedding

In standard manifold learning problems it is recommended that each feature is appropriately scaled to have mean 0 and variance 1. However, this is equivalent to treating all genes as equally contributing towards the process. Therefore it is recommended not to scale the dataset.

The entire transcriptome can be used to construct the embedding. However, it can be useful to pick only high-variance genes removing some of the residual noise from housekeeping or lowly expressed ones. The justification behind this is that the main source of variation in our dataset will be attributed to the process of interest. These high variance genes can be found using spike-ins (see Brennecke et al. Nature Methods 2014) or simlpy by fitting CV-mean curves and finding genes with a CV much higher than the mean:

x <- t(log10(exprs(HSMM) + 1))
x_mean <- colMeans(x)
x_var <- apply(x, 2, var)
genes_for_fit <- x_mean > 0.3
CV2 <- x_var[genes_for_fit] / (x_mean[genes_for_fit])^2
df_fit <- data.frame(m = x_mean[genes_for_fit], CV2 = CV2)
fit_loglin <- nls(CV2 ~ a * 10^(-k * m), data = df_fit, start=c(a=5, k=1)) 
ak <- coef(fit_loglin)
f <- function(x) ak[1] * 10^(-ak[2] * x) 
genes_for_embedding <- (CV2 > 4 * predict(fit_loglin))
df_fit$for_embedding <- as.factor(genes_for_embedding)
ggplot(df_fit, aes(x=m, y=CV2, color = for_embedding)) + geom_point() +
  theme_bw() + xlab('Mean') + ylab('CV2') + scale_color_fivethirtyeight() +
  stat_function(fun=f, color='black')

Next we take the log10 of the dataset (using a pseudocount of 1) and fit the embedding using the embeddr function using the default settings:

set.seed(123)
#sce <- fromCellDataSet(HSMM, use_exprs_as = "fpkm")

gene_indices <- match(names(which(genes_for_embedding)), featureNames(sce))
sce <- embeddr(sce, genes_for_embedding = gene_indices)

pData(sce)$long_state <- plyr::mapvalues(pData(sce)$State, from=1:3,
                                            to=c('Differentiating myoblast',
                                                 'Proliferating cell',
                                                 'Interstitial mesenchymal cell'))

plot_embedding(sce, color_by = 'long_state')

We can view the graph of cells to see how connections between different parts form:

plot_graph(sce)

We can also cluster the embedding using kmeans and plot:

sce <- cluster_embedding(sce, k = 3)

sce_tmp <- sce
phenoData(sce_tmp)$cluster <- plyr::mapvalues(pData(sce_tmp)$cluster, from=c(3, 1, 2),
                                            to=c(1,2,3))
phenoData(sce_tmp)$cluster <- plyr::mapvalues(pData(sce_tmp)$cluster, from=1:3,
                                            to=c('Differentiating myoblast',
                                                 'Proliferating cell',
                                                 'Interstitial mesenchymal cell'))

plot_embedding(sce_tmp)

Finally, we want to check the method is different to a more primitive technique such as PCA alone:

plotPCA(sce[gene_indices,], colour_by = 'long_state')

Pseudotime fitting

In the monocle paper they show that groups 1 & 3 correspond to differentiating cells while group 2 is contamination. We can separate off groups 1 & 3, fit the pseudotime trajectories and plot:

#save(sce, file='~/delete_me.Rdata')
sce_23 <- sce[, pData(sce)$cluster %in% c(1,2)]
sce_23 <- fit_pseudotime(sce_23)
plot_embedding(sce_23)

We can also compare our pseudotime with that of monocle:

in_state23 <- pData(sce_23)$cell_id
monocle_df <- filter(pData(HSMM), cell_id %in% in_state23)

qplot(arrange(monocle_df, cell_id)$Pseudotime, arrange(pData(sce_23), cell_id)$pseudotime) +
  theme_bw() + xlab('Monocle pseudotime') + ylab('embeddr pseudotime')

So there is good correspondence between the monocle and embeddr pseudotimes, though it appears that the embeddr version goes in the wrong direction. Pseudotimes are equivalent up to parity and scaling transformations (which is perhaps more philosophical than it sounds), so we can use the embeddr::reverse_pseudotime function to make the pseudotimes ‘run’ in the same direction:

sce_23 <- reverse_pseudotime(sce_23)
qplot(arrange(monocle_df, cell_id)$Pseudotime, arrange(pData(sce_23), cell_id)$pseudotime) +
  theme_bw() + xlab('Monocle pseudotime') + ylab('embeddr pseudotime')

sce_23 <- reverse_pseudotime(sce_23)

The overall correlation between the two pseudotime trajectories is -0.80, which is pretty good.

Plotting genes in pseudotime

To plot the genes in pseudotime we need to provide the original gene values for the cells in clusters 1 & 3:

#xp <- select(data.frame(t(x)), one_of(Mp$cell_id))
genes_to_plot <- row.names(subset(fData(HSMM), 
                                  gene_short_name %in% c("CDK1", "MEF2C", "MYH3", "MYOG","ID1")))
plot_in_pseudotime(sce_23[genes_to_plot,])

which is largely similar to the monocle equivalent.

Let’s have a look at the MRF family of transcription factors:

mrf <- c('MYOD1', 'MYF5', 'MYF6', 'MYOG') 
mrf_ind <- sapply(mrf, match, fData(sce_23)$gene_short_name)

plot_in_pseudotime(sce_23[mrf_ind,], use_short_names = TRUE)

Robustness of the embedding

Choice of nearest neighbours

nns <- c(5,6,8,10,15,20)
sce_npst <- sce
sce_npst$pseudotime <- NULL
embedding_plots <- lapply(nns, function(i) {
  sce_map <- embeddr(sce_npst, genes_for_embedding = gene_indices, nn = i)
  sce_map <- fit_pseudotime(sce_map, clusters=c(1, 2))
#   R <- dplyr::select(pData(sce_map), trajectory_1, trajectory_2, pseudotime)
#   R <- cbind(R, redDim(sce_map))
  plot_embedding(sce_map)
})

plot_grid(plotlist = embedding_plots, labels = as.character(nns))

Subsampling the number of cells

Subsample to check robustness:

ncv <- 8
inds <- createFolds(1:ncol(sce), ncv)

embedding_plots <- lapply(1:ncv, function(i) {
  sce_reduced <- sce[gene_indices ,-inds[[i]] ]
  sce_reduced <- embeddr(sce_reduced)
  sce_reduced <- fit_pseudotime(sce_reduced, clusters=c(1,2))
  return(plot_embedding(sce_reduced))
})

plot_grid(plotlist = embedding_plots[1:4])

plot_grid(plotlist = embedding_plots[5:8])

Try the same with 1/2 of the data:

inds_2 <- createMultiFolds(1:ncol(sce), k=2, times=6)

embeddings_2 <- lapply(1:length(inds_2), function(i) {
  sce_reduced <- sce[gene_indices ,-inds_2[[i]] ]
  sce_reduced <- embeddr(sce_reduced, nn = 5)
  sce_reduced <- fit_pseudotime(sce_reduced, clusters=c(1,2))
  return(plot_embedding(sce_reduced))
})

plot_grid(plotlist = embeddings_2[1:6])

plot_grid(plotlist = embeddings_2[7:12])

And see that the pseudotime fits are roughly constant:

set.seed(45)
folds <- createMultiFolds(1:ncol(sce), k = 2, times = 30)
all_embeddings <- lapply(folds, function(ind) {
    sce_tmp <- embeddr(sce[gene_indices,ind])#, genes_for_embedding = gene_indices)
    sce_tmp
  })

psts <- lapply(all_embeddings, function(sce_tmp) {
  sce_tmp_13 <- sce_tmp[,pData(sce_tmp)$cluster %in% c(1,2)]
  sce_tmp_13 <- fit_pseudotime(sce_tmp_13, clusters = c(1,2))
  sce_tmp_13$pseudotime
})

 corrs <- sapply(1:length(all_embeddings), function(i) {
  sce_i <- all_embeddings[[i]]
  cells_in_i <- pData(sce_i)$cell_id
  sce_23_red <- sce_23[, pData(sce_23)$cell_id %in% cells_in_i]
  abs(cor(psts[[i]], sce_23_red$pseudotime, method="spearman"))
})
qplot(corrs, geom='density') + theme_bw() +
  ggtitle('Pseudotime correlation for subsampled cells') +
  xlab('Correlation')

print(summary(corrs))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.5405  0.9680  0.9762  0.9656  0.9804  0.9898

Subsampling the number of genes

##save(sce, gene_indices, file='~/oxford/embeddr//data/saved.Rdata')
folds <- createMultiFolds(1:length(gene_indices), k = 2, times = 30)
all_embeddings <- lapply(folds, function(ind) {
    #sce_tmp <- sce[gene_indices, ind]
    sce_tmp <- embeddr(sce[gene_indices[ind],])#, genes_for_embedding = gene_indices)
    sce_tmp
  })

#all_embeddings_unlist <- unlist(all_embeddings, recursive=FALSE)

psts <- lapply(all_embeddings, function(sce_tmp) {
  sce_tmp_13 <- sce_tmp[,pData(sce_tmp)$cluster %in% c(1,2)]
  sce_tmp_13 <- fit_pseudotime(sce_tmp_13, clusters = c(1,2))
  sce_tmp_13$pseudotime
})

 corrs <- sapply(1:length(all_embeddings), function(i) {
#   sce_i <- all_embeddings[[i]]
#   cells_in_i <- pData(sce_i)$cell_id
#   sce_23_red <- sce_23[, pData(sce_23)$cell_id %in% cells_in_i]
#   abs(cor(psts[[i]], sce_23_red$pseudotime))
   abs(cor(pseudotime(sce_23), psts[[i]], method="spearman"))
})
qplot(corrs, geom='density') + theme_bw() +
  ggtitle('Pseudotime correlation for subsampled genes') +
  xlab('Correlation')

print(summary(corrs))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.5258  0.8586  0.9052  0.8787  0.9434  0.9704

Unravelling the contaminated cells

In the original monocle dataset, only a small number of cells were assigned to group 3 (‘interstitial mesenchymal cell’). However, our re-analysis suggests it is somewhat a larger with 72 cells in total. These cells were identified as mesenchymal cells as they expressed PDGFRA and SPHK1 in high abundances. We can look to see whether in our cell set we’ve identified more cells that are potentially contamination:

phenoData(sce)$monocle_classification <- mapvalues(pData(sce)$State, from=1:3, c('non-cont','non-cont','cont'))
phenoData(sce)$embeddr_classification <- mapvalues(pData(sce)$cluster, from=1:3, c('non-cont','cont','non-cont'))

print(kable(as.data.frame(table(dplyr::select(pData(sce), monocle_classification, embeddr_classification)))))
monocle_classification embeddr_classification Freq
non-cont non-cont 156
cont non-cont 43
non-cont cont 72
cont cont 0

So embeddr calls significatnly more cells as contaminated, while calls none as non-contaminated that monocle calls contaminated.

What we’d like to do is see how the gene markers compare when considering the contamined cells identified by monocle, the contaminated cells we identify, and all other cells:

agree_contamination <- apply(dplyr::select(pData(sce), monocle_classification, embeddr_classification), 
                             1, function(x) {
  if(x['monocle_classification'] == 'cont' & x['embeddr_classification'] == 'cont') {
     return('Agree contamination')
  } else if(x['monocle_classification'] != 'cont' & x['embeddr_classification'] == 'cont') {
    return('Embeddr only')
  }  else if(x['monocle_classification'] == 'cont') {
      return('Monocle only')
  } else {
    return('Agree no contamination')
  } 
})

cont_genes <- row.names(subset(fData(HSMM),  gene_short_name %in% c("PDGFRA", "SPHK1")))
short_names <- fData(HSMM)[cont_genes,]$gene_short_name
y <- exprs(sce[cont_genes,])
#y <- scale(t(y))
y <- data.frame(t(y))
y$agree_contamination <- agree_contamination
y <- melt(y, id.vars='agree_contamination', variable.name='gene', value.name='counts')
y$gene <- mapvalues(y$gene, from = cont_genes, to = short_names)

ggplot(y, aes(x=factor(agree_contamination), color=factor(agree_contamination), y=counts)) + 
  facet_wrap(~gene) + 
  geom_boxplot(outlier.shape=NA) + geom_jitter(alpha=0.5) +  theme_bw() + xlab('') +
  theme(axis.text.x = element_text(angle = -50, hjust=0)) + scale_color_fivethirtyeight(guide=FALSE) +
  ylab('log10(FPKM + 1) counts') + ggtitle('Expression of mesenchymal markers') 

So it looks like they might have missed a few cells expressing PDGFRA and SPHK1. We can also plot the markers in the embedded space, as per in the original paper:

cont_genes <- row.names(subset(fData(HSMM),  gene_short_name %in% c("CDK1", "MYOG", "PDGFRA", 
                                                                    "SPHK1", "MYF5", "NCAM1")))
short_names <- fData(HSMM)[cont_genes,]$gene_short_name
y <- exprs(sce[cont_genes,])
y <- t(y)
y <- data.frame(y)
M_y <- cbind(cluster = pData(sce)$cluster, redDim(sce), y)
M_y <- dplyr::rename(M_y, Cluster = cluster)
M_y_melted <- melt(M_y, id.vars=c('component_1','component_2','Cluster'), 
                   variable.name='gene', value.name='count')
M_y_melted$gene <- mapvalues(M_y_melted$gene, from = cont_genes, to = short_names)
ggplot(M_y_melted, aes(x=component_1, y=component_2, size=count, color=factor(Cluster))) + 
  geom_point(alpha=0.6) + facet_wrap(~ gene) + theme_bw() +
  scale_color_fivethirtyeight(name='Cluster') + scale_size_continuous(name='log10(FPKM)')  +
  xlab('Component 1') + ylab('Component 2')

Interestingly there appears to be a subset of cells that express both CDK1 and the contamination markers PDGFRA and SPHK1. Let’s look at those we’ve designated as contaminated and plot the expression:

cdk1_ind <- match('CDK1', fData(HSMM)$gene_short_name)
pdgfra_ind <- match('PDGFRA', fData(HSMM)$gene_short_name)
is_contaminated <- pData(sce)$cluster == 2

cx <- exprs(sce[c(cdk1_ind, pdgfra_ind), is_contaminated])
cx <- data.frame(t(cx))
names(cx) <- c('CDK1','PDGFRA')
ggplot(data=cx) + geom_point(aes(x=CDK1, y=PDGFRA)) +
  theme_minimal() + xlab('CDK1 expression log10(FPKM + 1)') +
  ylab('PDGFRA expression log10(FPKM + 1)') + 
  ggtitle('Coexpression of PDGFRA and CDK1 in contaminated cells')

Modelling genes in pseudotime

Model a single gene in pseudotime:

cdk1 <- row.names(subset(fData(HSMM),  gene_short_name == 'CDK1'))
#sce_23@lowerDetectionLimit <- log10(1 + 0.1)
model <- fit_pseudotime_model(sce_23, cdk1)
null_model <- fit_null_model(sce_23, cdk1)
p_val <- compare_models(model, null_model)
plot_pseudotime_model(sce_23[cdk1, ], model)

Now try all of them:

n_cells_expressed <- rowSums(is_exprs(sce_23))
keep_gene <- n_cells_expressed > 0.2 * dim(sce_23)[2]
sce_23_kept <- sce_23[keep_gene,]
# save(sce = sce_23_kept, file = "~/oxford/embeddr/data/sce_pst.Rdata")

## can optionally use pre-computed models in the pseudotime test function. 
## this is really handy as we can keep these around for predicting and plotting
## without having to compute them every time
diff_gene_test <- pseudotime_test(sce_23_kept, n_cores = 1)

And plot p-vals:

qplot(diff_gene_test$p_val, binwidth = 0.01) + theme_bw() + xlab('p-value') 

qplot(diff_gene_test$q_val, binwidth = 0.01) + theme_bw() + xlab('corrected p-value')

alpha <- 0.01
sig_genes <- diff_gene_test$gene[diff_gene_test$q_val < alpha]
sce_sig <- sce_23_kept[sig_genes,]

Identifying gene clusters in pseudotime

Use the predicted expression from the fitted models to cluster genes by pseudotemporal expression:

Now we can calulate the predicted expression matrix:

## let's get the predicted expression matrix from the models
pe <- predicted_expression(sce_sig)

And we can plot the correlation plot:

pcor <- cor(pe)
dist_cor <- 1 - pcor / 2 
hc <- hclust(as.dist(dist_cor))
plot(hc, labels=FALSE)

Now look at conserved modules:

n_cuts <- 4
gene_classes <- cutree(hc, n_cuts)
      
df_gene <- data.frame(gene=colnames(pe), class=gene_classes)

pe <- data.frame(scale(pe)) ## scaled pst-by-gene
pe$pseudotime <- pseudotime(sce_sig)
## save(pe, df_gene, file='~/pe.Rdata')

pe_melted <- melt(pe, id.vars='pseudotime', value.name='expression', variable.name='gene')
pe_melted <- inner_join(pe_melted, df_gene)

## we want to plot the mean expression rather than all of it (messy)
gp_groups <- group_by(pe_melted, class, pseudotime)
mean_expr_per_group <- dplyr::summarise(gp_groups, mean_expr = mean(expression))
pe_melted <- inner_join(pe_melted, mean_expr_per_group)
## pe_melted <- arrange(pe_melted, gene, pseudotime)

ggplot(pe_melted) + geom_line(aes(x=pseudotime, y=mean_expr), color='red') +
  facet_wrap(~ class) + stat_density2d(aes(x=pseudotime, y=expression), n=150) +
  theme_bw() + ylab('Expression') # add ncol = 1 for vertical representation

Number of genes in each class:

genes_per_class <- sapply(1:n_cuts, function(i) sum(gene_classes == i))
df_gpc <- data.frame(Class=1:n_cuts, 'Number in class' = genes_per_class)
print(kable(df_gpc, caption='Genes per class'))
Genes per class
Class Number.in.class
1 744
2 336
3 746
4 131

Now look at enriched GO terms for each class:

genes_per_class <- sapply(1:n_cuts, function(i) 1 * (df_gene$class == i))
gene_names <- df_gene$gene
all_genes <- featureNames(sce_23_kept)
gene_names <- sapply(as.character(gene_names), function(gn) strsplit(gn, '[.]')[[1]][1])
all_genes <- sapply(as.character(all_genes), function(gn) strsplit(gn, '[.]')[[1]][1])

genes_not_de <- setdiff(all_genes, gene_names)
genes_per_class <- rbind(genes_per_class, matrix(0, ncol=ncol(genes_per_class), nrow=length(genes_not_de)))

rownames(genes_per_class) <- c(gene_names, genes_not_de)

enriched_terms <- apply(genes_per_class, 2, function(gene_set) {
  pwf <- nullp(gene_set, 'hg19', 'ensGene', plot.fit=FALSE)
  go <- goseq(pwf, 'hg19', 'ensGene', test.cats=c('GO:BP'))
  go$log_qval <- log10(p.adjust(go$over_represented_pvalue, method='BH'))
  go <- dplyr::filter(go, log_qval < log10(0.01))
  go <- dplyr::select(go, category, log_qval, term)
  names(go) <- c('Category','log10 q-value','Term')
  return(go)
  })

reduced <- lapply(enriched_terms, head, 6)

Now print the terms:

for(i in 1:n_cuts) {
  print(kable(reduced[[i]], caption = paste('GO terms for cluster', i)))
}
GO terms for cluster 1
Category log10 q-value Term
GO:0000278 -38.16902 mitotic cell cycle
GO:0007049 -35.34295 cell cycle
GO:0000280 -33.65508 nuclear division
GO:1903047 -32.49645 mitotic cell cycle process
GO:0048285 -30.95800 organelle fission
GO:0022402 -30.77693 cell cycle process
GO terms for cluster 2
Category log10 q-value Term
GO:0006613 -12.651632 cotranslational protein targeting to membrane
GO:0045047 -12.651632 protein targeting to ER
GO:0072599 -12.465065 establishment of protein localization to endoplasmic reticulum
GO:0006614 -12.371712 SRP-dependent cotranslational protein targeting to membrane
GO:0070972 -11.144909 protein localization to endoplasmic reticulum
GO:0000184 -9.518562 nuclear-transcribed mRNA catabolic process, nonsense-mediated decay
GO terms for cluster 3
Category log10 q-value Term
GO:0061061 -23.73616 muscle structure development
GO:0007517 -21.88244 muscle organ development
GO:0003012 -20.73693 muscle system process
GO:0006936 -20.25709 muscle contraction
GO:0030049 -18.88609 muscle filament sliding
GO:0033275 -18.88609 actin-myosin filament sliding
GO terms for cluster 4
Category log10 q-value Term
GO:0051084 -4.637288 ‘de novo’ posttranslational protein folding
GO:0006458 -4.544402 ‘de novo’ protein folding
GO:0006457 -4.436593 protein folding